Global setup

library(tidyverse)
library(edgeR)
library(AnnotationHub)
library(magrittr)
library(scales)
library(GenomicRanges)
library(cqn)
library(fgsea)
library(kableExtra)
if (interactive()) setwd(here::here("R"))
theme_set(theme_bw())
nCores <- min(12, detectCores() - 1)

Gene GC content and length object

# # Get GC content and lengths
# gcTrans <- url("https://uofabioinformaticshub.github.io/Ensembl_GC/Release94/Danio_rerio.GRCz11.94.rds") %>%
#   readRDS()
# # Convert to the gene-level with average length, average GC, max length, and GC of longest transcript for all transcripts assigned to a single gene.
# gcGene <- gcTrans %>%
#   split(f = .$gene_id) %>%
#   mclapply(function(x){
#     gr <- reduce(x)
#     df <- DataFrame(
#       gene_id = unique(x$gene_id),
#       gene_symbol = unique(x$gene_symbol),
#       aveLength = round(mean(x$length),0) %>% as.integer(),
#       aveGc = sum(x$length * x$gc) / sum(x$length),
#       maxLength = max(x$length),
#       longestGc = x$gc[which.max(x$length)[[1]]]
#     )
#     mcols(gr) <- df
#     gr
#   }, mc.cores = 4) %>%
#   GRangesList() %>%
#   unlist() %>%
#   na.omit()
# # This takes a reasonable amount of time
# # Save as .rds file for faster loading
# saveRDS("../files/gcGene.rds")
gcGene <- readRDS("../files/gcGene.rds")

GSEA ID conversion and pathways

# Load id conversion file
idConvert <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
  dplyr::select(Geneid = zfID, EntrezID = Entrez) %>%
  mutate(EntrezID = as.character(EntrezID))
# Create function to convert ids
convertHsEG2Dr <- function(ids, df = idConvert){
  dplyr::filter(df, EntrezID %in% ids)$Geneid
}
# Conversion of zebrafish ensembl ID to zebrafish symbol, for plotting on network analyses
idConvertSymbol <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
  dplyr::select(label = zfID, symbol = zfName) %>%
  na.omit() %>%
  unique()
# Import hallmark human gene genesets and tidy the gene set names
# .gmt file downloaded from:
# http://software.broadinstitute.org/gsea/downloads.jsp 
hallmark <- gmtPathways("../files/h.all.v6.2.entrez.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "HALLMARK_"))

2019 data

Load data

# Load counts analysed by feature counts
counts_19 <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  dplyr::select(Geneid, starts_with("W"), starts_with("Q"))

Create DGEList

# Create DGEList and calculate normalisaton factors
dgeList_19 <- counts_19 %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
# Set group variable
dgeList_19$samples$group <- colnames(dgeList_19) %>%
  str_extract("(W|Q)") %>%
  factor(levels = c("W", "Q"))

Add gene information

# Add AnnotationHub and subset to search for zebrafish
ah <- AnnotationHub()
ah %>%
  subset(species == "Danio rerio") %>%
  subset(dataprovider == "Ensembl") %>%
  subset(rdataclass == "EnsDb")
## AnnotationHub with 12 records
## # snapshotDate(): 2019-10-29 
## # $dataprovider: Ensembl
## # $species: Danio rerio
## # $rdataclass: EnsDb
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## #   rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH53201"]]' 
## 
##             title                           
##   AH53201 | Ensembl 87 EnsDb for Danio Rerio
##   AH53705 | Ensembl 88 EnsDb for Danio Rerio
##   AH56671 | Ensembl 89 EnsDb for Danio Rerio
##   AH57746 | Ensembl 90 EnsDb for Danio Rerio
##   AH60762 | Ensembl 91 EnsDb for Danio Rerio
##   ...       ...                             
##   AH64906 | Ensembl 94 EnsDb for Danio rerio
##   AH67932 | Ensembl 95 EnsDb for Danio rerio
##   AH69169 | Ensembl 96 EnsDb for Danio rerio
##   AH73861 | Ensembl 97 EnsDb for Danio rerio
##   AH74989 | Ensembl 98 EnsDb for Danio rerio
# Select correct Ensembl release
ensDb <- ah[["AH64906"]]
# Extract GenomicRanges object from ensDb
genesGR <- genes(ensDb)
# Remove redundant columns from mcols
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name", 
                                   "gene_biotype", "entrezid")] %>%
  as.data.frame() %>%
  dplyr::select(-entrezid) %>%
  left_join(as.data.frame(mcols(gcGene))) %>%
  distinct(gene_id, .keep_all = TRUE) %>%
  set_rownames(.$gene_id) %>%
  DataFrame() %>%
  .[names(genesGR),]
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeList_19$genes <- genesGR[rownames(dgeList_19),]

Data QC

# Create logical vector of genes to keep that fit criteria
genes2keep_19 <- dgeList_19 %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFilt_19 <- dgeList_19[genes2keep_19,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()

Without cqn

Differential expression

# Create model matrix
design_19 <- model.matrix(~group, data = dgeFilt_19$samples)
# Perform DE test
topTable_19 <- estimateGLMCommonDisp(dgeFilt_19) %>%
  glmFit(design_19) %>%
  glmLRT(coef=2) %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", ID.start, ID.end, sep = "-") %>%
  unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
  set_colnames(str_remove(colnames(.), "ID.")) %>%
  dplyr::select(
    Geneid = gene_id, 
    Symbol = gene_name,
    AveExpr = logCPM, 
    logFC, 
    P.Value = PValue, 
    FDR, 
    Location,
    aveLength,
    aveGc,
    maxLength,
    longestGc
  ) %>%
  mutate(
    DE = FDR < 0.05,
    RankStat = -sign(logFC)*log10(P.Value)
  )
if (!interactive()) {
  topTable_19 %>%
    dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
    dplyr::slice(1:20) %>%
    mutate(
      P.Value = sprintf("%.2e", P.Value),
      FDR = sprintf("%.2e", FDR)
    ) %>%
    kable(caption = "The 20 most differentially expressed genes") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The 20 most differentially expressed genes
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000091368 AL954327.1 2.656650 -5.906385 1.45e-98 2.67e-94
ENSDARG00000111860 exorh 3.993233 3.405441 1.63e-74 1.49e-70
ENSDARG00000009637 rcvrn3 2.423242 3.635172 4.41e-68 2.70e-64
ENSDARG00000111827 rcvrnb 3.200713 3.212106 3.16e-63 1.45e-59
ENSDARG00000059163 rbp3 2.234735 3.296390 3.12e-57 1.14e-53
ENSDARG00000098249 asmt 2.165764 3.257285 9.65e-56 2.95e-52
ENSDARG00000002768 pvalb2 2.238580 -3.378321 4.41e-50 1.16e-46
ENSDARG00000093753 BX004774.2 2.161002 2.809252 2.75e-44 6.32e-41
ENSDARG00000067990 myhz1.1 2.622203 -2.876345 9.63e-43 1.96e-39
ENSDARG00000053254 mylpfa 2.393426 -2.878776 3.33e-41 6.11e-38
ENSDARG00000040565 ckmb 2.632848 -2.787234 8.11e-41 1.35e-37
ENSDARG00000024877 ptgr1 4.624935 2.330074 1.38e-40 2.11e-37
ENSDARG00000035327 ckma 2.350054 -2.799236 2.80e-39 3.95e-36
ENSDARG00000075963 mhc1uba 5.918828 2.189626 1.24e-37 1.62e-34
ENSDARG00000099197 actc1b 3.642893 -2.441845 3.51e-37 4.30e-34
ENSDARG00000017624 krt4 3.777492 -2.393940 2.15e-36 2.47e-33
ENSDARG00000017246 prx 2.325670 -2.619766 2.00e-35 2.16e-32
ENSDARG00000024789 mxc 1.817686 2.471397 4.40e-34 4.48e-31
ENSDARG00000100397 pde6c 1.332726 2.436494 7.25e-30 7.01e-27
ENSDARG00000035438 myhc4 1.653235 -2.433943 8.48e-28 7.78e-25

GC content

topTable_19 %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content", 
    title = "2019 data without cqn"
  ) +
  scale_color_manual(values = c("grey70", "red"))
Comparison of logFC and GC content before using cqn

Comparison of logFC and GC content before using cqn

topTable_19 %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, RankStat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content", 
    y = "Ranking Statistic", 
    title = "2019 data without cqn"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_color_manual(values = c("grey70", "red"))
Comparison of ranking statistics and GC content before using cqn

Comparison of ranking statistics and GC content before using cqn

Gene length

topTable_19 %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveLength, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene length",
    title = "2019 data without cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)

topTable_19 %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveLength, RankStat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene Length", 
    y = "Ranking Statistic",
    title = "2019 data without cqn"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)

GSEA

# Create named vector of gene level statistics 
ranks_19 <- topTable_19 %>%
  dplyr::arrange(RankStat) %>%
  with(structure(RankStat, names = Geneid))
# Run GSEA
fgsea_19 <- fgsea(hallmark, ranks_19, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(bonferroni = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
if (!interactive()) {
  fgsea_19 %>%
    dplyr::select(-leadingEdge) %>%
    dplyr::slice(1:10) %>%
    kable(caption = "The 10 most significantly enriched pathways") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The 10 most significantly enriched pathways
pathway pval FDR ES NES nMoreExtreme size bonferroni
INTERFERON_GAMMA_RESPONSE 0.0000197 0.0004921 0.7628697 2.247079 0 200 0.0009842
ALLOGRAFT_REJECTION 0.0000197 0.0004921 0.7663245 2.253349 0 195 0.0009842
MYOGENESIS 0.0001425 0.0023756 -0.7025634 -2.153505 6 216 0.0071267
INTERFERON_ALPHA_RESPONSE 0.0045548 0.0505899 0.7494870 2.028404 225 83 0.2277399
INFLAMMATORY_RESPONSE 0.0050590 0.0505899 0.6412325 1.863712 255 173 0.2529494
OXIDATIVE_PHOSPHORYLATION 0.0201470 0.1678919 0.5569524 1.652094 1024 219 1.0000000
KRAS_SIGNALING_DN 0.0246021 0.1698544 -0.5742175 -1.702190 1220 155 1.0000000
COMPLEMENT 0.0271767 0.1698544 0.5554376 1.636422 1378 202 1.0000000
ESTROGEN_RESPONSE_EARLY 0.0381706 0.2030876 -0.5167204 -1.566930 1876 196 1.0000000
MYC_TARGETS_V1 0.0406175 0.2030876 0.5272316 1.564737 2067 220 1.0000000

With cqn

Run cqn

# Check which genes in dgeFilt have GC content and length in gcGene
genes2keep_19cqn <- rownames(dgeFilt_19) %in% mcols(gcGene)$gene_id
dgeFilt_19cqn <- dgeFilt_19[genes2keep_19cqn,, keep.lib.sizes = FALSE]
cqn_19 <- cqn(
  dgeFilt_19cqn$counts, 
  x = mcols(dgeFilt_19cqn$genes)$aveGc,
  lengths = mcols(dgeFilt_19cqn$genes)$aveLength,
  sizeFactors = dgeFilt_19cqn$samples$lib.size
)

Differential expression

dgeFilt_19cqn$offset <- cqn_19$glm.offset
design_19cqn <- model.matrix(~ group, data = dgeFilt_19cqn$samples)
topTable_19cqn <- estimateGLMCommonDisp(dgeFilt_19cqn) %>%
  glmFit(design_19cqn) %>%
  glmLRT(coef=2) %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", ID.start, ID.end, sep = "-") %>%
  unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
  set_colnames(str_remove(colnames(.), "ID.")) %>%
  dplyr::select(
    Geneid = gene_id, 
    Symbol = gene_name,
    AveExpr = logCPM, 
    logFC, 
    P.Value = PValue, 
    FDR, 
    Location,
    aveLength,
    aveGc,
    maxLength,
    longestGc
  ) %>%
  mutate(
    DE = FDR < 0.05,
    RankStat = -sign(logFC)*log10(P.Value)
  )
if (!interactive()) {
  topTable_19cqn %>%
    dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
    dplyr::slice(1:20) %>%
    mutate(
      P.Value = sprintf("%.2e", P.Value),
      FDR = sprintf("%.2e", FDR)
    ) %>%
    kable(caption = "The 20 most differentially expressed genes") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The 20 most differentially expressed genes
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000111860 exorh 4.022706 3.439359 2.75e-92 4.84e-88
ENSDARG00000009637 rcvrn3 2.455893 3.560549 9.24e-77 8.13e-73
ENSDARG00000111827 rcvrnb 3.231423 3.144133 6.92e-73 4.06e-69
ENSDARG00000098249 asmt 2.198911 3.333733 4.53e-68 1.99e-64
ENSDARG00000059163 rbp3 2.267500 3.302384 3.87e-67 1.36e-63
ENSDARG00000002768 pvalb2 2.263883 -3.395165 1.17e-58 3.42e-55
ENSDARG00000024877 ptgr1 4.651772 2.231490 5.25e-46 1.32e-42
ENSDARG00000053254 mylpfa 2.418812 -2.793595 1.32e-45 2.90e-42
ENSDARG00000075963 mhc1uba 5.946181 2.150825 4.69e-45 9.17e-42
ENSDARG00000040565 ckmb 2.658295 -2.676428 1.39e-44 2.45e-41
ENSDARG00000035327 ckma 2.375427 -2.680239 2.80e-42 4.48e-39
ENSDARG00000067990 myhz1.1 2.647735 -2.604647 7.86e-42 1.15e-38
ENSDARG00000099197 actc1b 3.668377 -2.325127 4.04e-41 5.47e-38
ENSDARG00000017624 krt4 3.802966 -2.289225 1.11e-40 1.39e-37
ENSDARG00000024789 mxc 1.847148 2.479242 8.69e-40 1.02e-36
ENSDARG00000017246 prx 2.350905 -2.402629 3.08e-35 3.38e-32
ENSDARG00000100397 pde6c 1.366511 2.472544 5.25e-34 5.43e-31
ENSDARG00000074345 si:ch211-214b16.4 3.466161 1.918083 1.45e-32 1.42e-29
ENSDARG00000068457 tnnt3b 1.919037 -2.226803 2.72e-29 2.52e-26
ENSDARG00000010729 CABZ01073795.1 3.744796 1.772594 4.87e-29 4.29e-26
# Check similarity of genes
topTable_19cqn$Geneid[topTable_19cqn$DE == TRUE] %in%
  topTable_19$Geneid[topTable_19$DE == TRUE] %>% 
  table()
## .
## FALSE  TRUE 
##    54   271

GC content

topTable_19cqn %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content",
    title = "2019 data with cqn"
  ) +
  scale_color_manual(values = c("grey70", "red"))

topTable_19cqn %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, RankStat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content", 
    y = "Ranking Statistic",
    title = "2019 data with cqn"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_color_manual(values = c("grey70", "red"))

Gene length

topTable_19cqn %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveLength)) %>%
  ggplot(aes(aveLength, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene length",
    title = "2019 data with cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)

topTable_19cqn %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveLength)) %>%
  ggplot(aes(aveLength, RankStat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene Length", 
    y = "Ranking Statistic",
    title = "2019 data with cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_x_log10(labels = comma)

GSEA

# Create named vector of gene level statistics 
ranks_19cqn <- topTable_19cqn %>%
  dplyr::arrange(RankStat) %>%
  with(structure(RankStat, names = Geneid))
# Run GSEA
fgsea_19cqn <- fgsea(hallmark, ranks_19cqn, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
if (!interactive()) {
  fgsea_19cqn %>%
    dplyr::select(-leadingEdge) %>%
    dplyr::slice(1:10) %>%
    kable(caption = "The 10 most significantly enriched pathways") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The 10 most significantly enriched pathways
pathway pval FDR ES NES nMoreExtreme size padj
MYOGENESIS 0.0000241 0.0008592 -0.7136154 -2.158272 0 216 0.0012046
INTERFERON_GAMMA_RESPONSE 0.0000344 0.0008592 0.7884000 2.219730 1 200 0.0017183
ALLOGRAFT_REJECTION 0.0001031 0.0017190 0.7611030 2.138609 5 195 0.0051571
INTERFERON_ALPHA_RESPONSE 0.0015861 0.0198259 0.8062983 2.099017 87 83 0.0793036
INFLAMMATORY_RESPONSE 0.0091171 0.0911707 0.6523203 1.813266 525 173 0.4558533
ESTROGEN_RESPONSE_EARLY 0.0181522 0.1512687 -0.5376487 -1.610285 758 196 0.9076125
KRAS_SIGNALING_DN 0.0262577 0.1875549 -0.5551186 -1.623564 1117 155 1.0000000
COMPLEMENT 0.0325202 0.2032512 0.5813006 1.637586 1892 202 1.0000000
ESTROGEN_RESPONSE_LATE 0.0517571 0.2589051 -0.4915559 -1.471478 2164 195 1.0000000
HEME_METABOLISM 0.0517810 0.2589051 -0.4915485 -1.471455 2165 195 1.0000000

Comparisons (for viewing)

Differential expression

Before cqn: The 20 most differentially expressed genes
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000091368 AL954327.1 2.656650 -5.906385 1.45e-98 2.67e-94
ENSDARG00000111860 exorh 3.993233 3.405441 1.63e-74 1.49e-70
ENSDARG00000009637 rcvrn3 2.423242 3.635172 4.41e-68 2.70e-64
ENSDARG00000111827 rcvrnb 3.200713 3.212106 3.16e-63 1.45e-59
ENSDARG00000059163 rbp3 2.234735 3.296390 3.12e-57 1.14e-53
ENSDARG00000098249 asmt 2.165764 3.257285 9.65e-56 2.95e-52
ENSDARG00000002768 pvalb2 2.238580 -3.378321 4.41e-50 1.16e-46
ENSDARG00000093753 BX004774.2 2.161002 2.809252 2.75e-44 6.32e-41
ENSDARG00000067990 myhz1.1 2.622203 -2.876345 9.63e-43 1.96e-39
ENSDARG00000053254 mylpfa 2.393426 -2.878776 3.33e-41 6.11e-38
ENSDARG00000040565 ckmb 2.632848 -2.787234 8.11e-41 1.35e-37
ENSDARG00000024877 ptgr1 4.624935 2.330074 1.38e-40 2.11e-37
ENSDARG00000035327 ckma 2.350054 -2.799236 2.80e-39 3.95e-36
ENSDARG00000075963 mhc1uba 5.918828 2.189626 1.24e-37 1.62e-34
ENSDARG00000099197 actc1b 3.642893 -2.441845 3.51e-37 4.30e-34
ENSDARG00000017624 krt4 3.777492 -2.393940 2.15e-36 2.47e-33
ENSDARG00000017246 prx 2.325670 -2.619766 2.00e-35 2.16e-32
ENSDARG00000024789 mxc 1.817686 2.471397 4.40e-34 4.48e-31
ENSDARG00000100397 pde6c 1.332726 2.436494 7.25e-30 7.01e-27
ENSDARG00000035438 myhc4 1.653235 -2.433943 8.48e-28 7.78e-25
After cqn: The 20 most differentially expressed genes
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000111860 exorh 4.022706 3.439359 2.75e-92 4.84e-88
ENSDARG00000009637 rcvrn3 2.455893 3.560549 9.24e-77 8.13e-73
ENSDARG00000111827 rcvrnb 3.231423 3.144133 6.92e-73 4.06e-69
ENSDARG00000098249 asmt 2.198911 3.333733 4.53e-68 1.99e-64
ENSDARG00000059163 rbp3 2.267500 3.302384 3.87e-67 1.36e-63
ENSDARG00000002768 pvalb2 2.263883 -3.395165 1.17e-58 3.42e-55
ENSDARG00000024877 ptgr1 4.651772 2.231490 5.25e-46 1.32e-42
ENSDARG00000053254 mylpfa 2.418812 -2.793595 1.32e-45 2.90e-42
ENSDARG00000075963 mhc1uba 5.946181 2.150825 4.69e-45 9.17e-42
ENSDARG00000040565 ckmb 2.658295 -2.676428 1.39e-44 2.45e-41
ENSDARG00000035327 ckma 2.375427 -2.680239 2.80e-42 4.48e-39
ENSDARG00000067990 myhz1.1 2.647735 -2.604647 7.86e-42 1.15e-38
ENSDARG00000099197 actc1b 3.668377 -2.325127 4.04e-41 5.47e-38
ENSDARG00000017624 krt4 3.802966 -2.289225 1.11e-40 1.39e-37
ENSDARG00000024789 mxc 1.847148 2.479242 8.69e-40 1.02e-36
ENSDARG00000017246 prx 2.350905 -2.402629 3.08e-35 3.38e-32
ENSDARG00000100397 pde6c 1.366511 2.472544 5.25e-34 5.43e-31
ENSDARG00000074345 si:ch211-214b16.4 3.466161 1.918083 1.45e-32 1.42e-29
ENSDARG00000068457 tnnt3b 1.919037 -2.226803 2.72e-29 2.52e-26
ENSDARG00000010729 CABZ01073795.1 3.744796 1.772594 4.87e-29 4.29e-26

GC Content

Gene length

GSEA

Before cqn: The 10 most significantly enriched pathways
pathway pval FDR ES NES nMoreExtreme size bonferroni
INTERFERON_GAMMA_RESPONSE 0.0000197 0.0004921 0.7628697 2.247079 0 200 0.0009842
ALLOGRAFT_REJECTION 0.0000197 0.0004921 0.7663245 2.253349 0 195 0.0009842
MYOGENESIS 0.0001425 0.0023756 -0.7025634 -2.153505 6 216 0.0071267
INTERFERON_ALPHA_RESPONSE 0.0045548 0.0505899 0.7494870 2.028404 225 83 0.2277399
INFLAMMATORY_RESPONSE 0.0050590 0.0505899 0.6412325 1.863712 255 173 0.2529494
OXIDATIVE_PHOSPHORYLATION 0.0201470 0.1678919 0.5569524 1.652094 1024 219 1.0000000
KRAS_SIGNALING_DN 0.0246021 0.1698544 -0.5742175 -1.702190 1220 155 1.0000000
COMPLEMENT 0.0271767 0.1698544 0.5554376 1.636422 1378 202 1.0000000
ESTROGEN_RESPONSE_EARLY 0.0381706 0.2030876 -0.5167204 -1.566930 1876 196 1.0000000
MYC_TARGETS_V1 0.0406175 0.2030876 0.5272316 1.564737 2067 220 1.0000000
After cqn: The 10 most significantly enriched pathways
pathway pval FDR ES NES nMoreExtreme size padj
MYOGENESIS 0.0000241 0.0008592 -0.7136154 -2.158272 0 216 0.0012046
INTERFERON_GAMMA_RESPONSE 0.0000344 0.0008592 0.7884000 2.219730 1 200 0.0017183
ALLOGRAFT_REJECTION 0.0001031 0.0017190 0.7611030 2.138609 5 195 0.0051571
INTERFERON_ALPHA_RESPONSE 0.0015861 0.0198259 0.8062983 2.099017 87 83 0.0793036
INFLAMMATORY_RESPONSE 0.0091171 0.0911707 0.6523203 1.813266 525 173 0.4558533
ESTROGEN_RESPONSE_EARLY 0.0181522 0.1512687 -0.5376487 -1.610285 758 196 0.9076125
KRAS_SIGNALING_DN 0.0262577 0.1875549 -0.5551186 -1.623564 1117 155 1.0000000
COMPLEMENT 0.0325202 0.2032512 0.5813006 1.637586 1892 202 1.0000000
ESTROGEN_RESPONSE_LATE 0.0517571 0.2589051 -0.4915559 -1.471478 2164 195 1.0000000
HEME_METABOLISM 0.0517810 0.2589051 -0.4915485 -1.471455 2165 195 1.0000000

2017 data

Setup conversion object

# Create object for converting transcripts to genes
transGR <- transcripts(ensDb)
mcols(transGR) <- mcols(transGR)[c("tx_id", "gene_id")]
trans2Gene <- tibble(
  tx_id = transGR$tx_id,
  gene_id = transGR$gene_id
)

Load data

# Load counts from 2017 data obtained as a .rds from Nhi
counts_17 <- read_rds("nhiData/6month_Normoxia.rds")

Create DGEList

# Create DGEList
dgeList_17 <- counts_17$counts %>%
  as.data.frame() %>%
  rownames_to_column("tx_id") %>%
  as_tibble() %>%
  set_colnames(basename(colnames(.))) %>%
  mutate(tx_id = str_remove_all(tx_id, "\\.[0-9]*$")) %>%
  left_join(trans2Gene) %>%
  group_by(gene_id) %>%
  summarise_if(
    .predicate = is.numeric,
    .funs = sum
  ) %>%
  as.data.frame() %>%
  column_to_rownames("gene_id") %>%
  round() %>%
  set_colnames(str_remove(colnames(.), "[0-9]_MORGAN_6")) %>%
  set_colnames(str_remove(colnames(.), "PN")) %>%
  set_colnames(str_remove(colnames(.), "_[RS].+")) %>%
  set_colnames(str_replace(colnames(.), "P", "W")) %>%
  DGEList(
    group = colnames(.) %>%
      str_replace_all("([WQ])(.+)", "\\1") %>%
      str_replace_all("W", "WT") %>%
      str_replace_all("Q", "Mut") %>%
      factor(levels = c("WT", "Mut")),
    genes = genesGR[rownames(.)]
  ) %>%
  calcNormFactors() 

Data QC

# Create logical vector of genes to keep that fit criteria
genes2keep_17 <- dgeList_17 %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFilt_17 <- dgeList_17[genes2keep_17,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()

Without cqn

Differential expression

# Create model matrix
design_17 <- model.matrix(~group, data = dgeFilt_17$samples)
# Perform DE test
topTable_17 <-  estimateGLMCommonDisp(dgeFilt_17) %>%
  glmFit(design_17) %>%
  glmLRT(coef=2) %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", start, end, sep = "-") %>%
  unite("Location", seqnames, Range, strand, sep = ":") %>%
  set_colnames(str_remove(colnames(.), "ID.")) %>%
  dplyr::select(
    Geneid = gene_id, 
    Symbol = gene_name,
    AveExpr = logCPM, 
    logFC, 
    P.Value = PValue, 
    FDR, 
    Location,
    aveLength,
    aveGc,
    maxLength,
    longestGc
  ) %>%
  mutate(
    DE = FDR < 0.05,
    RankStat = -sign(logFC)*log10(P.Value)
  )
if (!interactive()) {
  topTable_17 %>%
    dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
    dplyr::slice(1:20) %>%
    mutate(
      P.Value = sprintf("%.2e", P.Value),
      FDR = sprintf("%.2e", FDR)
    ) %>%
    kable(caption = "The 20 most differentially expressed genes") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The 20 most differentially expressed genes
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000080675 si:dkey-71b5.7 3.0881824 -3.097472 2.28e-63 4.22e-59
ENSDARG00000058371 krt5 4.6741362 2.449728 8.45e-59 7.83e-55
ENSDARG00000112702 BX537263.3 7.3996536 -2.243755 1.88e-56 1.16e-52
ENSDARG00000061585 cyp4v7 2.3191654 -3.139064 5.07e-52 2.35e-48
ENSDARG00000087290 si:ch211-202h22.10 0.6296277 5.591881 2.94e-50 1.09e-46
ENSDARG00000036830 krt91 3.9384441 2.120057 1.22e-42 3.76e-39
ENSDARG00000074653 si:ch211-233m11.1 2.6910751 -2.277264 6.31e-36 1.67e-32
ENSDARG00000093531 slc37a4b 0.4139975 3.306116 9.71e-28 2.25e-24
ENSDARG00000031952 mb 2.5216783 1.889320 4.65e-27 9.57e-24
ENSDARG00000036773 s100s 2.1504571 1.812137 6.43e-23 1.19e-19
ENSDARG00000053480 aqp9b 1.7942745 1.755679 2.08e-19 3.51e-16
ENSDARG00000096564 CU915827.1 1.2875096 1.737070 2.47e-16 3.82e-13
ENSDARG00000045367 tuba1b 6.6404569 -1.126187 2.79e-16 3.98e-13
ENSDARG00000091734 traf3ip2l 3.0840975 -1.308960 7.46e-16 9.88e-13
ENSDARG00000069093 col2a1a 2.6124607 1.350510 1.10e-15 1.36e-12
ENSDARG00000020581 otofb 2.0502321 1.453107 2.21e-15 2.57e-12
ENSDARG00000095578 si:dkey-222h21.7 0.4110680 2.109993 4.60e-15 5.02e-12
ENSDARG00000092604 si:ch211-15j1.4 4.7446966 1.067221 7.73e-14 7.96e-11
ENSDARG00000101598 si:ch211-215p11.1 2.4986432 1.257070 1.49e-13 1.45e-10
ENSDARG00000096273 si:dkey-3n22.9 1.0162751 1.601951 7.98e-13 7.29e-10

GC content

topTable_17 %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content",
    title = "2017 data without cqn"
  ) +
  scale_color_manual(values = c("grey70", "red"))
Comparison of logFC and GC content before using cqn

Comparison of logFC and GC content before using cqn

topTable_17 %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, RankStat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content", 
    y = "Ranking Statistic",
    title = "2017 data without cqn"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_color_manual(values = c("grey70", "red"))
Comparison of ranking statistics and GC content before using cqn

Comparison of ranking statistics and GC content before using cqn

Gene length

topTable_17 %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveLength)) %>%
  ggplot(aes(aveLength, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene length",
    title = "2017 data without cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)
Comparison of logFC and GC content before using cqn

Comparison of logFC and GC content before using cqn

topTable_17 %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveLength)) %>%
  ggplot(aes(aveLength, RankStat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene Length", 
    y = "Ranking Statistic",
    title = "2017 data without cqn"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)
Comparison of ranking statistics and GC content before using cqn

Comparison of ranking statistics and GC content before using cqn

GSEA

# Create named vector of gene level statistics 
ranks_17 <- topTable_17 %>%
  dplyr::arrange(RankStat) %>%
  with(structure(RankStat, names = Geneid))
# Run GSEA
fgsea_17 <- fgsea(hallmark, ranks_17, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(bonferroni = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
if (!interactive()) {
  fgsea_17 %>%
    dplyr::select(-leadingEdge) %>%
    dplyr::slice(1:10) %>%
    kable(caption = "The 10 most significantly enriched pathways") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The 10 most significantly enriched pathways
pathway pval FDR ES NES nMoreExtreme size bonferroni
ANGIOGENESIS 0.0127794 0.3806018 0.7426339 1.888033 701 33 0.6389718
MTORC1_SIGNALING 0.0295921 0.3806018 -0.4814323 -1.641559 1051 229 1.0000000
MYOGENESIS 0.0372656 0.3806018 0.4628162 1.504488 2396 225 1.0000000
CHOLESTEROL_HOMEOSTASIS 0.0385437 0.3806018 -0.5582494 -1.711517 1605 81 1.0000000
XENOBIOTIC_METABOLISM 0.0452206 0.3806018 -0.4664074 -1.587038 1616 223 1.0000000
EPITHELIAL_MESENCHYMAL_TRANSITION 0.0456722 0.3806018 0.4525985 1.467334 2927 219 1.0000000
COMPLEMENT 0.0633502 0.4525013 0.4321599 1.396831 4046 213 1.0000000
P53_PATHWAY 0.0756213 0.4726333 0.4187876 1.356431 4843 217 1.0000000
OXIDATIVE_PHOSPHORYLATION 0.0900322 0.5001786 -0.4158784 -1.414168 3219 222 1.0000000
MYC_TARGETS_V1 0.1318700 0.5696833 -0.3686211 -1.253166 4721 221 1.0000000

With cqn

Run cqn

# Check which genes in dgeFilt have GC content and length in gcGene
genes2keep_17cqn <- rownames(dgeFilt_17) %in% mcols(gcGene)$gene_id
dgeFilt_17cqn <- dgeFilt_17[genes2keep_17cqn,, keep.lib.sizes = FALSE]
cqn_17 <- cqn(
  dgeFilt_17cqn$counts, 
  x = dgeFilt_17cqn$genes$aveGc,
  lengths = dgeFilt_17cqn$genes$aveLength,
  sizeFactors = dgeFilt_17cqn$samples$lib.size
)

Differential expression

dgeFilt_17cqn$offset <- cqn_17$glm.offset
design_17cqn <- model.matrix(~ group, data = dgeFilt_17cqn$samples)
topTable_17cqn <- estimateGLMCommonDisp(dgeFilt_17cqn) %>%
  glmFit(design_17cqn) %>%
  glmLRT(coef=2) %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", start, end, sep = "-") %>%
  unite("Location", seqnames, Range, strand, sep = ":") %>%
  set_colnames(str_remove(colnames(.), "ID.")) %>%
  dplyr::select(
    Geneid = gene_id, 
    Symbol = gene_name,
    AveExpr = logCPM, 
    logFC, 
    P.Value = PValue, 
    FDR, 
    Location,
    aveLength,
    aveGc,
    maxLength,
    longestGc
  ) %>%
  mutate(
    DE = FDR < 0.05,
    RankStat = -sign(logFC)*log10(P.Value)
  )
if (!interactive()) {
  topTable_17cqn %>%
    dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
    dplyr::slice(1:20) %>%
    mutate(
      P.Value = sprintf("%.2e", P.Value),
      FDR = sprintf("%.2e", FDR)
    ) %>%
    kable(caption = "The 20 most differentially expressed genes") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The 20 most differentially expressed genes
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000080675 si:dkey-71b5.7 3.0884154 -3.160407 2.50e-69 4.63e-65
ENSDARG00000058371 krt5 4.6721904 2.476419 2.95e-64 2.73e-60
ENSDARG00000061585 cyp4v7 2.3274742 -3.192150 3.79e-56 2.34e-52
ENSDARG00000036830 krt91 3.9359598 2.139469 3.25e-46 1.51e-42
ENSDARG00000112702 BX537263.3 7.3995271 -1.881984 2.12e-44 7.85e-41
ENSDARG00000087290 si:ch211-202h22.10 0.6256874 5.176158 5.82e-41 1.80e-37
ENSDARG00000031952 mb 2.5176837 1.905665 7.35e-29 1.95e-25
ENSDARG00000036773 s100s 2.1466136 1.815182 5.26e-24 1.22e-20
ENSDARG00000093531 slc37a4b 0.4111232 2.925726 1.21e-21 2.49e-18
ENSDARG00000053480 aqp9b 1.7906742 1.768972 1.93e-20 3.58e-17
ENSDARG00000074653 si:ch211-233m11.1 2.6923190 -1.673444 2.18e-20 3.68e-17
ENSDARG00000091734 traf3ip2l 3.0851222 -1.358823 7.11e-18 1.10e-14
ENSDARG00000069093 col2a1a 2.6097279 1.381891 3.67e-17 5.24e-14
ENSDARG00000045367 tuba1b 6.6405206 -1.094861 1.21e-16 1.60e-13
ENSDARG00000096564 CU915827.1 1.2862540 1.716131 1.68e-16 2.08e-13
ENSDARG00000020581 otofb 2.0469346 1.440647 8.49e-16 9.83e-13
ENSDARG00000092604 si:ch211-15j1.4 4.7443741 1.064560 1.02e-14 1.12e-11
ENSDARG00000086819 scn1laa 2.0374641 1.280426 6.88e-13 7.08e-10
ENSDARG00000103494 usp43a 3.3189310 1.078191 7.42e-13 7.23e-10
ENSDARG00000105675 BX005417.3 0.3569806 1.876728 1.64e-12 1.52e-09
# Check similarity of genes
topTable_17cqn$Geneid[topTable_17cqn$DE == TRUE] %in%
  topTable_17$Geneid[topTable_17$DE == TRUE] %>% 
  table()
## .
## FALSE  TRUE 
##    29   188

GC content

topTable_17cqn %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content",
    title = "2017 data with cqn"
  ) +
  scale_color_manual(values = c("grey70", "red"))
Comparison of logFC and GC content after using cqn

Comparison of logFC and GC content after using cqn

topTable_17cqn %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, RankStat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content", 
    y = "Ranking Statistic",
    title = "2017 data with cqn"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_color_manual(values = c("grey70", "red"))
Comparison of ranking statistics and GC content after using cqn

Comparison of ranking statistics and GC content after using cqn

Gene length

topTable_17cqn %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveLength)) %>%
  ggplot(aes(aveLength, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene length",
    title = "2017 data with cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)
Comparison of logFC and GC content after using cqn

Comparison of logFC and GC content after using cqn

topTable_17cqn %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveLength)) %>%
  ggplot(aes(aveLength, RankStat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene Length", 
    y = "Ranking Statistic",
    title = "2017 data with cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_x_log10(labels = comma)
Comparison of ranking statistics and GC content after using cqn

Comparison of ranking statistics and GC content after using cqn

GSEA

# Create named vector of gene level statistics 
ranks_17cqn <- topTable_17cqn %>%
  dplyr::arrange(RankStat) %>%
  with(structure(RankStat, names = Geneid))
# Run GSEA
fgsea_17cqn <- fgsea(hallmark, ranks_17cqn, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
if (!interactive()) {
  fgsea_17cqn %>%
    dplyr::select(-leadingEdge) %>%
    dplyr::slice(1:10) %>%
    kable(caption = "The 10 most significantly enriched pathways") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The 10 most significantly enriched pathways
pathway pval FDR ES NES nMoreExtreme size padj
ANGIOGENESIS 0.0108945 0.2749676 0.7558817 1.927492 616 33 0.5447258
MYOGENESIS 0.0132930 0.2749676 0.5050005 1.653177 900 225 0.6646503
EPITHELIAL_MESENCHYMAL_TRANSITION 0.0203707 0.2749676 0.4901243 1.600737 1376 219 1.0000000
XENOBIOTIC_METABOLISM 0.0323571 0.2749676 -0.4798916 -1.669441 1042 223 1.0000000
CHOLESTEROL_HOMEOSTASIS 0.0329066 0.2749676 -0.5533618 -1.738633 1279 81 1.0000000
MTORC1_SIGNALING 0.0329961 0.2749676 -0.4751843 -1.657984 1059 229 1.0000000
COMPLEMENT 0.0465185 0.3322753 0.4454749 1.450777 3131 213 1.0000000
P53_PATHWAY 0.0569722 0.3560760 0.4313166 1.407191 3842 217 1.0000000
INFLAMMATORY_RESPONSE 0.0909277 0.4586587 -0.3958120 -1.346656 3111 172 1.0000000
OXIDATIVE_PHOSPHORYLATION 0.0917317 0.4586587 -0.4016592 -1.396654 2959 222 1.0000000

Comparisons (for viewing)

Differential expression

Before cqn: The 20 most differentially expressed genes
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000080675 si:dkey-71b5.7 3.0881824 -3.097472 2.28e-63 4.22e-59
ENSDARG00000058371 krt5 4.6741362 2.449728 8.45e-59 7.83e-55
ENSDARG00000112702 BX537263.3 7.3996536 -2.243755 1.88e-56 1.16e-52
ENSDARG00000061585 cyp4v7 2.3191654 -3.139064 5.07e-52 2.35e-48
ENSDARG00000087290 si:ch211-202h22.10 0.6296277 5.591881 2.94e-50 1.09e-46
ENSDARG00000036830 krt91 3.9384441 2.120057 1.22e-42 3.76e-39
ENSDARG00000074653 si:ch211-233m11.1 2.6910751 -2.277264 6.31e-36 1.67e-32
ENSDARG00000093531 slc37a4b 0.4139975 3.306116 9.71e-28 2.25e-24
ENSDARG00000031952 mb 2.5216783 1.889320 4.65e-27 9.57e-24
ENSDARG00000036773 s100s 2.1504571 1.812137 6.43e-23 1.19e-19
ENSDARG00000053480 aqp9b 1.7942745 1.755679 2.08e-19 3.51e-16
ENSDARG00000096564 CU915827.1 1.2875096 1.737070 2.47e-16 3.82e-13
ENSDARG00000045367 tuba1b 6.6404569 -1.126187 2.79e-16 3.98e-13
ENSDARG00000091734 traf3ip2l 3.0840975 -1.308960 7.46e-16 9.88e-13
ENSDARG00000069093 col2a1a 2.6124607 1.350510 1.10e-15 1.36e-12
ENSDARG00000020581 otofb 2.0502321 1.453107 2.21e-15 2.57e-12
ENSDARG00000095578 si:dkey-222h21.7 0.4110680 2.109993 4.60e-15 5.02e-12
ENSDARG00000092604 si:ch211-15j1.4 4.7446966 1.067221 7.73e-14 7.96e-11
ENSDARG00000101598 si:ch211-215p11.1 2.4986432 1.257070 1.49e-13 1.45e-10
ENSDARG00000096273 si:dkey-3n22.9 1.0162751 1.601951 7.98e-13 7.29e-10
After cqn: The 20 most differentially expressed genes
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000080675 si:dkey-71b5.7 3.0884154 -3.160407 2.50e-69 4.63e-65
ENSDARG00000058371 krt5 4.6721904 2.476419 2.95e-64 2.73e-60
ENSDARG00000061585 cyp4v7 2.3274742 -3.192150 3.79e-56 2.34e-52
ENSDARG00000036830 krt91 3.9359598 2.139469 3.25e-46 1.51e-42
ENSDARG00000112702 BX537263.3 7.3995271 -1.881984 2.12e-44 7.85e-41
ENSDARG00000087290 si:ch211-202h22.10 0.6256874 5.176158 5.82e-41 1.80e-37
ENSDARG00000031952 mb 2.5176837 1.905665 7.35e-29 1.95e-25
ENSDARG00000036773 s100s 2.1466136 1.815182 5.26e-24 1.22e-20
ENSDARG00000093531 slc37a4b 0.4111232 2.925726 1.21e-21 2.49e-18
ENSDARG00000053480 aqp9b 1.7906742 1.768972 1.93e-20 3.58e-17
ENSDARG00000074653 si:ch211-233m11.1 2.6923190 -1.673444 2.18e-20 3.68e-17
ENSDARG00000091734 traf3ip2l 3.0851222 -1.358823 7.11e-18 1.10e-14
ENSDARG00000069093 col2a1a 2.6097279 1.381891 3.67e-17 5.24e-14
ENSDARG00000045367 tuba1b 6.6405206 -1.094861 1.21e-16 1.60e-13
ENSDARG00000096564 CU915827.1 1.2862540 1.716131 1.68e-16 2.08e-13
ENSDARG00000020581 otofb 2.0469346 1.440647 8.49e-16 9.83e-13
ENSDARG00000092604 si:ch211-15j1.4 4.7443741 1.064560 1.02e-14 1.12e-11
ENSDARG00000086819 scn1laa 2.0374641 1.280426 6.88e-13 7.08e-10
ENSDARG00000103494 usp43a 3.3189310 1.078191 7.42e-13 7.23e-10
ENSDARG00000105675 BX005417.3 0.3569806 1.876728 1.64e-12 1.52e-09

GC Content

Gene length

GSEA

Before cqn: The 10 most significantly enriched pathways
pathway pval FDR ES NES nMoreExtreme size bonferroni
ANGIOGENESIS 0.0127794 0.3806018 0.7426339 1.888033 701 33 0.6389718
MTORC1_SIGNALING 0.0295921 0.3806018 -0.4814323 -1.641559 1051 229 1.0000000
MYOGENESIS 0.0372656 0.3806018 0.4628162 1.504488 2396 225 1.0000000
CHOLESTEROL_HOMEOSTASIS 0.0385437 0.3806018 -0.5582494 -1.711517 1605 81 1.0000000
XENOBIOTIC_METABOLISM 0.0452206 0.3806018 -0.4664074 -1.587038 1616 223 1.0000000
EPITHELIAL_MESENCHYMAL_TRANSITION 0.0456722 0.3806018 0.4525985 1.467334 2927 219 1.0000000
COMPLEMENT 0.0633502 0.4525013 0.4321599 1.396831 4046 213 1.0000000
P53_PATHWAY 0.0756213 0.4726333 0.4187876 1.356431 4843 217 1.0000000
OXIDATIVE_PHOSPHORYLATION 0.0900322 0.5001786 -0.4158784 -1.414168 3219 222 1.0000000
MYC_TARGETS_V1 0.1318700 0.5696833 -0.3686211 -1.253166 4721 221 1.0000000
After cqn: The 10 most significantly enriched pathways
pathway pval FDR ES NES nMoreExtreme size padj
ANGIOGENESIS 0.0108945 0.2749676 0.7558817 1.927492 616 33 0.5447258
MYOGENESIS 0.0132930 0.2749676 0.5050005 1.653177 900 225 0.6646503
EPITHELIAL_MESENCHYMAL_TRANSITION 0.0203707 0.2749676 0.4901243 1.600737 1376 219 1.0000000
XENOBIOTIC_METABOLISM 0.0323571 0.2749676 -0.4798916 -1.669441 1042 223 1.0000000
CHOLESTEROL_HOMEOSTASIS 0.0329066 0.2749676 -0.5533618 -1.738633 1279 81 1.0000000
MTORC1_SIGNALING 0.0329961 0.2749676 -0.4751843 -1.657984 1059 229 1.0000000
COMPLEMENT 0.0465185 0.3322753 0.4454749 1.450777 3131 213 1.0000000
P53_PATHWAY 0.0569722 0.3560760 0.4313166 1.407191 3842 217 1.0000000
INFLAMMATORY_RESPONSE 0.0909277 0.4586587 -0.3958120 -1.346656 3111 172 1.0000000
OXIDATIVE_PHOSPHORYLATION 0.0917317 0.4586587 -0.4016592 -1.396654 2959 222 1.0000000